### import needed pacakges
import pandas as pd
import plotly_express as px
import numpy as np
## read pickle file
df_kiva = pd.read_pickle('kiva_v1_after_preprocessing.pkl')
df_kiva.head(2)
| funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrower_female | borrower_male | repayment_amount | deficit_amount | avg_lending_amount | loan_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 300.0 | 300.0 | Fruits & Vegetables | Food | PK | Pakistan | Lahore | 12.0 | 12 | irregular | 1 | 0 | 25.00 | 0.0 | 25.00 | Silver |
| 1 | 575.0 | 575.0 | Rickshaw | Transportation | PK | Pakistan | Lahore | 11.0 | 14 | irregular | 2 | 0 | 52.27 | 0.0 | 41.07 | Gold |
df_kiva.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 646617 entries, 0 to 671188 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 funded_amount 646617 non-null float64 1 loan_amount 646617 non-null float64 2 activity 646617 non-null object 3 sector 646617 non-null category 4 country_code 646617 non-null object 5 country 646617 non-null object 6 region 646617 non-null object 7 term_in_months 646617 non-null float64 8 lender_count 646617 non-null int64 9 repayment_interval 646617 non-null category 10 borrower_female 646617 non-null int64 11 borrower_male 646617 non-null int64 12 repayment_amount 646617 non-null float64 13 deficit_amount 646617 non-null float64 14 avg_lending_amount 646617 non-null float64 15 loan_category 646617 non-null category dtypes: category(3), float64(6), int64(3), object(4) memory usage: 70.9+ MB
## no of unique values in column activity
df_kiva.loc[:,'activity'].nunique()
163
## no of unique values in column sector
df_kiva.loc[:,'sector'].nunique()
15
## finding the no of projects in different sectors
df_sector = df_kiva.groupby(by=['sector'],as_index=False).agg(count=('sector','count'))
df_sector
| sector | count | |
|---|---|---|
| 0 | Agriculture | 174578 |
| 1 | Arts | 11938 |
| 2 | Clothing | 32547 |
| 3 | Construction | 6263 |
| 4 | Education | 30123 |
| 5 | Entertainment | 829 |
| 6 | Food | 135342 |
| 7 | Health | 9190 |
| 8 | Housing | 30153 |
| 9 | Manufacturing | 6204 |
| 10 | Personal Use | 25422 |
| 11 | Retail | 123167 |
| 12 | Services | 44838 |
| 13 | Transportation | 15389 |
| 14 | Wholesale | 634 |
## treemap in plotly express
treemap_fig1 = px.treemap(df_sector, path = ['sector'],values = 'count',
title = 'Treemap showing distribution of projects across sectors',
color_discrete_sequence = px.colors.sequential.Viridis)
treemap_fig1.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig1.update_layout(title_x=0.45)
treemap_fig1.show()
Observation
The Agriculture sector has the highest no of projects.
The Wholesale sector has lowest no of projects.
df_sector_1=df_kiva.groupby(by=['sector','activity']).agg(count=('activity','count')).reset_index()
df_sector_1
| sector | activity | count | |
|---|---|---|---|
| 0 | Agriculture | Adult Care | 0 |
| 1 | Agriculture | Agriculture | 26698 |
| 2 | Agriculture | Air Conditioning | 0 |
| 3 | Agriculture | Animal Sales | 9157 |
| 4 | Agriculture | Aquaculture | 108 |
| ... | ... | ... | ... |
| 2440 | Wholesale | Water Distribution | 0 |
| 2441 | Wholesale | Weaving | 0 |
| 2442 | Wholesale | Wedding Expenses | 0 |
| 2443 | Wholesale | Well digging | 0 |
| 2444 | Wholesale | Wholesale | 357 |
2445 rows × 3 columns
##treemap in plotly express
treemap_fig2 = px.treemap(df_sector_1, path=['sector','activity'], values='count',
title = 'Treemap showing distribution of projects across sectors and activities',
color_discrete_sequence = px.colors.sequential.Viridis)
treemap_fig2.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig2.update_layout(title_x=0.45)
treemap_fig2.show()
Observation
There are various activities under each sector.
Farming is the largest acitivity in all acitivities.
## finding the no of projects in different countries
df_country = df_kiva.groupby(by=['country'],as_index=False).agg(count=('country','count'))
df_country
| country | count | |
|---|---|---|
| 0 | Afghanistan | 2 |
| 1 | Albania | 1934 |
| 2 | Armenia | 8629 |
| 3 | Azerbaijan | 1942 |
| 4 | Belize | 124 |
| ... | ... | ... |
| 82 | Vietnam | 9532 |
| 83 | Virgin Islands | 2 |
| 84 | Yemen | 2313 |
| 85 | Zambia | 775 |
| 86 | Zimbabwe | 3973 |
87 rows × 2 columns
##treemap in plotly express
treemap_fig3 = px.treemap(df_country, path=['country'],values='count',
title = 'Treemap showing distribution of projects across countries',
color_discrete_sequence = px.colors.sequential.Plasma)
treemap_fig3.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig3.update_layout(title_x=0.45)
treemap_fig3.show()
Observation
Philippines has the most number of projects, it is expected that they also have the highest value of total funds, we will find out in further analysis.
The treemap gets crowded for smaller no of projects as there are in total 87 countries,but the aim here is to get an overview of the number of projects with respect to countries.
## finding distribution of projects across countries and sectors
df_country_sector = df_kiva.groupby(by=['country','sector']).agg(count=('sector','count')).reset_index()
df_country_sector
| country | sector | count | |
|---|---|---|---|
| 0 | Afghanistan | Agriculture | 0 |
| 1 | Afghanistan | Arts | 2 |
| 2 | Afghanistan | Clothing | 0 |
| 3 | Afghanistan | Construction | 0 |
| 4 | Afghanistan | Education | 0 |
| ... | ... | ... | ... |
| 1300 | Zimbabwe | Personal Use | 1 |
| 1301 | Zimbabwe | Retail | 765 |
| 1302 | Zimbabwe | Services | 230 |
| 1303 | Zimbabwe | Transportation | 9 |
| 1304 | Zimbabwe | Wholesale | 16 |
1305 rows × 3 columns
## treemap in plotly express
treemap_fig4 = px.treemap(df_country_sector, path=['country','sector'],values='count',
title = 'Treemap showing distribution of projects across countries and sectors',
color_discrete_sequence = px.colors.sequential.Plasma)
treemap_fig4.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig4.update_layout(title_x=0.45)
treemap_fig4.show()
Observation
We can observe that agriculture is among the top three activities in most of the countries.
##finding distribution of funds across sectors
df_funded_amount = df_kiva.groupby(by=['sector'],as_index=False).agg(Total_funds=('funded_amount','sum'))
df_funded_amount
| sector | Total_funds | |
|---|---|---|
| 0 | Agriculture | 132657485.0 |
| 1 | Arts | 11778255.0 |
| 2 | Clothing | 34748620.0 |
| 3 | Construction | 6320365.0 |
| 4 | Education | 29699895.0 |
| 5 | Entertainment | 1023095.0 |
| 6 | Food | 114765260.0 |
| 7 | Health | 9159825.0 |
| 8 | Housing | 20189925.0 |
| 9 | Manufacturing | 5355900.0 |
| 10 | Personal Use | 12675775.0 |
| 11 | Retail | 90518655.0 |
| 12 | Services | 42878810.0 |
| 13 | Transportation | 9805025.0 |
| 14 | Wholesale | 918900.0 |
##treemap in plotly express
treemap_fig5 = px.treemap(df_funded_amount, path = ['sector'],values = 'Total_funds',
title = 'Treemap showing distribution of funds across sectors',
color_discrete_sequence = px.colors.sequential.Inferno)
treemap_fig5.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig5.update_layout(title_x=0.45)
treemap_fig5.show()
Observation
Agriculture has the highest value of total funds recieved, this is in line with fig1, as it had the highest no of projects.
##finding distribution of funds across countries
df_funded_amount1 = df_kiva.groupby(by=['country'],as_index=False).agg(Total_funds=('funded_amount','sum'))
df_funded_amount1
| country | Total_funds | |
|---|---|---|
| 0 | Afghanistan | 14000.0 |
| 1 | Albania | 2490000.0 |
| 2 | Armenia | 11184350.0 |
| 3 | Azerbaijan | 2697225.0 |
| 4 | Belize | 113500.0 |
| ... | ... | ... |
| 82 | Vietnam | 12927575.0 |
| 83 | Virgin Islands | 0.0 |
| 84 | Yemen | 1784075.0 |
| 85 | Zambia | 1142350.0 |
| 86 | Zimbabwe | 3341225.0 |
87 rows × 2 columns
##treemap in plotly express
treemap_fig6 = px.treemap(df_funded_amount1, path = ['country'],values = 'Total_funds',
title = 'Treemap showing distribution of funds across countries',
color_discrete_sequence = px.colors.sequential.Inferno)
treemap_fig6.update_layout(margin = dict(t=50, l=25, r=25, b=25))
treemap_fig6.update_layout(title_x=0.45)
treemap_fig6.show()
Observation
Phillipines had the highest no of projects and it also has highest value of total funds recieved.
Lets plot the data we have on world map to find out the reach of kiva.
## Number of projects in different countries and total funded amount
df_country1 = df_kiva.groupby(by=['country'],as_index=False).agg(Total_funds=('funded_amount','sum'),
count=('country','count'))
df_country1
| country | Total_funds | count | |
|---|---|---|---|
| 0 | Afghanistan | 14000.0 | 2 |
| 1 | Albania | 2490000.0 | 1934 |
| 2 | Armenia | 11184350.0 | 8629 |
| 3 | Azerbaijan | 2697225.0 | 1942 |
| 4 | Belize | 113500.0 | 124 |
| ... | ... | ... | ... |
| 82 | Vietnam | 12927575.0 | 9532 |
| 83 | Virgin Islands | 0.0 | 2 |
| 84 | Yemen | 1784075.0 | 2313 |
| 85 | Zambia | 1142350.0 | 775 |
| 86 | Zimbabwe | 3341225.0 | 3973 |
87 rows × 3 columns
## geo scatter plot
geo_fig7 = px.scatter_geo(df_country1,
locations="country",
locationmode='country names',
size='count',
title='Map showing no of projects in different countries')
geo_fig7.update_layout(title_x=0.45)
geo_fig7.show()
Observation
We can see that the kiva supported projects are mostly in the southern hemisphere,in developing countries. There are only a few projects in the USA.
Lets also plot the total value of funds in each countries.
## geo scatter plot
geo_fig8 = px.scatter_geo(df_country1,
locations="country",
locationmode='country names',
size='Total_funds',
title='Map showing funds in different countries'
)
geo_fig8.update_layout(title_x=0.45)
geo_fig8.show()
Observations
Again as we observed in treemap we see that Philippines has the highest value of funds. The south american continent has also together large amounts of funds. But we need to check it in comparison with total of african continent.
Lets find out the number of projects lead by men and women.
## number of projects lead by women.
df_female=df_kiva.loc[(df_kiva.loc[:,'borrower_female']>=1) & (df_kiva.loc[:,'borrower_male']==0),:].shape
df_female
(470903, 16)
## number of projects lead by men
df_male=df_kiva.loc[(df_kiva.loc[:,'borrower_female']==0) & (df_kiva.loc[:,'borrower_male']>=1),:].shape
df_male
(133307, 16)
## number of projects lead by both men and women together
df_together=df_kiva.loc[(df_kiva.loc[:,'borrower_female']>=1) & (df_kiva.loc[:,'borrower_male']>=1),:].shape
df_together
(42407, 16)
#df_female[0]
## Creating a new Dataframe
df_gender=pd.DataFrame({'Gender':['Female','Male','Together'],
'No_of_projects':[df_female[0],df_male[0],df_together[0]]})
df_gender
| Gender | No_of_projects | |
|---|---|---|
| 0 | Female | 470903 |
| 1 | Male | 133307 |
| 2 | Together | 42407 |
## bar plot
bar_fig9=px.bar(df_gender,
x='Gender',
y='No_of_projects',
color='Gender',
color_discrete_map={'Female':'Deeppink','Male':'Lightblue','Together':'Purple'},
template='simple_white',
title='Distribution of Projects among Men and Women')
bar_fig9.update_layout(title_x=0.45)
bar_fig9.show()
## grouping with respect to repayment interval
df_repayment = df_kiva.groupby(by=['repayment_interval'],as_index=False).agg(Count=('repayment_interval','count'),repayment_avg=('repayment_amount','mean')).sort_values(by='Count',ascending=False)
df_repayment
| repayment_interval | Count | repayment_avg | |
|---|---|---|---|
| 2 | monthly | 330282 | 64.960015 |
| 1 | irregular | 250016 | 84.805957 |
| 0 | bullet | 65721 | 63.045885 |
| 3 | weekly | 598 | 59.207040 |
pie_fig10= px.pie( df_repayment,
values = 'Count',
labels = 'repayment_interval',
names = 'repayment_interval',
color_discrete_sequence = px.colors.sequential.RdBu)
pie_fig10.show()
Observation
The most popular method of repayment is monthly,the coressponding value of average repayment per month is 64.96
df_category=df_kiva.groupby(by=['loan_category'],as_index=False).agg(Count=('loan_category','count'),Total_amount=('loan_amount','sum')).sort_values(by='Total_amount',ascending=False)
df_category
| loan_category | Count | Total_amount | |
|---|---|---|---|
| 3 | Platinum | 149593 | 340582075.0 |
| 2 | Gold | 161016 | 119558050.0 |
| 1 | Silver | 192399 | 73665950.0 |
| 0 | Bronze | 143609 | 26460925.0 |
total_count=df_category['Count'].sum()
total_count
total_funds=df_category['Total_amount'].sum()
total_funds
560267000.0
df_category['count_per'] = df_category['Count'] / total_count * 100
df_category['funds_per'] = df_category['Total_amount']/total_funds*100
df_category
| loan_category | Count | Total_amount | count_per | funds_per | |
|---|---|---|---|---|---|
| 3 | Platinum | 149593 | 340582075.0 | 23.134715 | 60.789244 |
| 2 | Gold | 161016 | 119558050.0 | 24.901294 | 21.339477 |
| 1 | Silver | 192399 | 73665950.0 | 29.754708 | 13.148365 |
| 0 | Bronze | 143609 | 26460925.0 | 22.209283 | 4.722913 |
##bar plot
bar_fig11=px.bar(df_category,
x='loan_category',
y=['funds_per','count_per'],
labels={'loan_category':'Project Category','y':'Value'},
barmode='group')
bar_fig11.show()
The highest total of funds is found in the country of philippines and the largest sector of investment is agriculture. It ll be intresting to see are any of those projects from platinum category fall.
df_philippines=df_kiva.loc[(df_kiva.loc[:,'loan_category']== 'Platinum') & (df_kiva.loc[:,'country']=='Philippines'),:]
df_philippines
| funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrower_female | borrower_male | repayment_amount | deficit_amount | avg_lending_amount | loan_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 124 | 1150.0 | 1150.0 | Personal Housing Expenses | Housing | PH | Philippines | Tudela-Tinacla-an, Clarin, Misamis Occidental | 25.0 | 22 | irregular | 1 | 0 | 46.00 | 0.0 | 52.27 | Platinum |
| 144 | 1025.0 | 1025.0 | Retail | Retail | PH | Philippines | Liloy- Baybay Liloy, Zamboanga del Norte | 26.0 | 28 | monthly | 1 | 0 | 39.42 | 0.0 | 36.61 | Platinum |
| 943 | 1350.0 | 1350.0 | Cosmetics Sales | Retail | PH | Philippines | La Victoria, Aurora, Zamboanga del Sur | 20.0 | 51 | monthly | 1 | 0 | 67.50 | 0.0 | 26.47 | Platinum |
| 957 | 1350.0 | 1350.0 | Clothing Sales | Clothing | PH | Philippines | Bacoor, Cavite | 8.0 | 36 | irregular | 1 | 0 | 168.75 | 0.0 | 37.50 | Platinum |
| 963 | 1125.0 | 1125.0 | Cereals | Food | PH | Philippines | Binalbagan, Negros Occidental | 14.0 | 39 | irregular | 1 | 0 | 80.36 | 0.0 | 28.85 | Platinum |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 668800 | 1200.0 | 1200.0 | General Store | Retail | PH | Philippines | Ngileb Alfonso Lista, Ifugao | 8.0 | 13 | irregular | 1 | 0 | 150.00 | 0.0 | 92.31 | Platinum |
| 669090 | 1400.0 | 1400.0 | Fish Selling | Food | PH | Philippines | Brgy. Villa Coloma, Bagabag,Nueva Vizcaya | 8.0 | 15 | irregular | 1 | 0 | 175.00 | 0.0 | 93.33 | Platinum |
| 669437 | 50.0 | 1100.0 | General Store | Retail | PH | Philippines | Capirpiriwan Cordon, Isabela | 13.0 | 2 | irregular | 1 | 0 | 3.85 | 1050.0 | 25.00 | Platinum |
| 669890 | 1400.0 | 1400.0 | Transportation | Transportation | PH | Philippines | Ueg, San Mariano, Isabela | 8.0 | 11 | irregular | 1 | 0 | 175.00 | 0.0 | 127.27 | Platinum |
| 670094 | 900.0 | 1400.0 | Farming | Agriculture | PH | Philippines | La Salette, Benito Soliven, Isabela | 8.0 | 15 | bullet | 1 | 0 | 112.50 | 500.0 | 60.00 | Platinum |
4501 rows × 16 columns
df_philsec= df_philippines.groupby(by=['sector'],as_index=False).agg(Count=('sector','count')).sort_values(by='Count',ascending=False)
df_philsec
| sector | Count | |
|---|---|---|
| 11 | Retail | 1614 |
| 6 | Food | 942 |
| 0 | Agriculture | 795 |
| 12 | Services | 303 |
| 8 | Housing | 233 |
| 13 | Transportation | 186 |
| 2 | Clothing | 139 |
| 4 | Education | 68 |
| 9 | Manufacturing | 57 |
| 7 | Health | 51 |
| 3 | Construction | 50 |
| 10 | Personal Use | 24 |
| 14 | Wholesale | 24 |
| 1 | Arts | 8 |
| 5 | Entertainment | 7 |
Observations
There are 4501 projects in platinum category in philippines. There are 795 projects in agriculture sector in philippines